Data Preparation¶

Importing Libraries¶

In [ ]:
import pandas as pd
# ensure openpyxl==3.1.0, version 3.1.1 has a bug
In [ ]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
import os

Downloading Data¶

In [ ]:
MajorRoadsEmissionsURL = 'https://data.london.gov.uk/download/london-atmospheric-emissions-inventory-2013/8c520de2-c518-4e64-932c-0071ac826742/LAEI2013_MajorRoads_EmissionsbyLink_2013.xlsx'
SupportingDocURL = 'https://data.london.gov.uk/download/london-atmospheric-emissions-inventory-2013/1f3755f3-dae8-4d39-b9b4-0cc7adb4e826/1.%20Supporting%20Information.zip'
LocalFileMajorRoadsEmissions = 'data/LAEI2013_MajorRoads_EmissionsbyLink_2013.xlsx'
LocalFileSupportingDoc = 'data/1. Supporting Information.zip'
In [ ]:
# Check if data folder exists, if not create it
if not os.path.exists('data'):
    os.makedirs('data')

# Check if data is already downloaded, if not set load_from_local to False
if os.path.exists(LocalFileMajorRoadsEmissions):
    load_from_local = True
else:
    load_from_local = False

if load_from_local:
    MajorRoadsEmissionsURL = LocalFileMajorRoadsEmissions 
xls = pd.ExcelFile(MajorRoadsEmissionsURL)
In [ ]:
if load_from_local:
    DocZip = ZipFile(LocalFileSupportingDoc)
else:
    resp = urlopen(SupportingDocURL)
    DocZip = ZipFile(BytesIO(resp.read()))
DocZip.namelist()[:10]
Out[ ]:
['1. Supporting Information/',
 '1. Supporting Information/1. Road Traffic Data/',
 '1. Supporting Information/1. Road Traffic Data/Excel/',
 '1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2008_AADT-VKM.xlsx',
 '1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2010_AADT-VKM.xlsx',
 '1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2013_AADT-VKM.xlsx',
 '1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2020_AADT-VKM.xlsx',
 '1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2025_AADT-VKM.xlsx',
 '1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2030_AADT-VKM.xlsx',
 '1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_VKM_AllYears_Summary.xlsx']

Load Data into dataframes¶

Road Emissions Data¶

In [ ]:
xls.sheet_names
Out[ ]:
['2013 LTS Rds', '2013 Other Major Rds']
In [ ]:
df_LTSroads = pd.read_excel(xls, '2013 LTS Rds')
print(df_LTSroads.shape)
df_LTSroads.head()
(366220, 32)
Out[ ]:
GridId Toid GRID_ExactCut_ID Location_ExactCut BoroughName_ExactCut Lts Length (m) Emissions Year Pollutant ... Artic5Axle Artic6Axle PetrolCar DieselCar PetrolLgv DieselLgv LtBus Coach ElectricCar ElectricLgv
0 6253 4000000027908919 24 External NonGLA 18898 50.761449 DFT 2013 CO2 ... 0.241372 0.190560 8.761443 4.810774 0.037550 1.735121 0.0 0.0 0.0 0.0
1 6253 4000000027947931 24 External NonGLA 18895 28.592125 DFT 2013 CO2 ... 0.000000 0.000000 0.015535 0.008576 0.000000 0.000000 0.0 0.0 0.0 0.0
2 6253 4000000028013383 24 External NonGLA 15816 5.101391 DFT 2013 CO2 ... 0.027271 0.021509 0.939028 0.518684 0.004055 0.184415 0.0 0.0 0.0 0.0
3 6253 4000000028025820 24 External NonGLA 15816 3.757501 DFT 2013 CO2 ... 0.020087 0.015843 0.691654 0.382044 0.002987 0.135834 0.0 0.0 0.0 0.0
4 6253 4000000028029388 24 External NonGLA 15816 1.624593 DFT 2013 CO2 ... 0.008685 0.006850 0.299044 0.165180 0.001292 0.058729 0.0 0.0 0.0 0.0

5 rows × 32 columns

In [ ]:
df_OtherMajorRoads = pd.read_excel(xls, '2013 Other Major Rds')
print(df_OtherMajorRoads.shape)
df_OtherMajorRoads.head()
(513740, 32)
Out[ ]:
GridId Toid GRID_ExactCut_ID Location_ExactCut BoroughName_ExactCut DotRef Length (m) Emissions Year Pollutant ... Artic5Axle Artic6Axle PetrolCar DieselCar PetrolLgv DieselLgv LtBus Coach ElectricCar ElectricLgv
0 5911 4000000027989878 2 External NonGLA 28440 9.714495 DFT 2013 CO2 ... 3.006694 12.549219 18.791658 19.630267 0.279151 11.005820 0.000000 0.744254 0.0 0.0
1 5911 4000000027989880 2 External NonGLA 28440 0.000000 DFT 2013 CO2 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.0
2 5911 4000000027989882 2 External NonGLA 57226 8.577192 DFT 2013 CO2 ... 0.760333 2.446611 19.478135 10.300493 0.120149 7.734197 0.754408 0.868990 0.0 0.0
3 5911 4000000028014332 2 External NonGLA 57226 9.347936 DFT 2013 CO2 ... 0.823130 2.648621 20.173154 10.553940 0.123945 7.418739 0.820669 0.897038 0.0 0.0
4 5911 4000000027888882 2 External NonGLA 28440 0.000000 DFT 2013 CO2 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.0

5 rows × 32 columns

In [ ]:
# concat  the two dataframes
df_RoadEmissions = pd.concat([df_LTSroads, df_OtherMajorRoads], ignore_index=True)
print(df_RoadEmissions.shape)
df_RoadEmissions.head()
(879960, 33)
Out[ ]:
GridId Toid GRID_ExactCut_ID Location_ExactCut BoroughName_ExactCut Lts Length (m) Emissions Year Pollutant ... Artic6Axle PetrolCar DieselCar PetrolLgv DieselLgv LtBus Coach ElectricCar ElectricLgv DotRef
0 6253 4000000027908919 24 External NonGLA 18898.0 50.761449 DFT 2013 CO2 ... 0.190560 8.761443 4.810774 0.037550 1.735121 0.0 0.0 0.0 0.0 NaN
1 6253 4000000027947931 24 External NonGLA 18895.0 28.592125 DFT 2013 CO2 ... 0.000000 0.015535 0.008576 0.000000 0.000000 0.0 0.0 0.0 0.0 NaN
2 6253 4000000028013383 24 External NonGLA 15816.0 5.101391 DFT 2013 CO2 ... 0.021509 0.939028 0.518684 0.004055 0.184415 0.0 0.0 0.0 0.0 NaN
3 6253 4000000028025820 24 External NonGLA 15816.0 3.757501 DFT 2013 CO2 ... 0.015843 0.691654 0.382044 0.002987 0.135834 0.0 0.0 0.0 0.0 NaN
4 6253 4000000028029388 24 External NonGLA 15816.0 1.624593 DFT 2013 CO2 ... 0.006850 0.299044 0.165180 0.001292 0.058729 0.0 0.0 0.0 0.0 NaN

5 rows × 33 columns

Traffic Data¶

In [ ]:
traffic_excel = DocZip.read('1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2013_AADT-VKM.xlsx')
traffic_xls = pd.ExcelFile(BytesIO(traffic_excel))
In [ ]:
df_AADT = pd.read_excel(traffic_xls, 'MajorGrid_AADTandVKM_2013')
print(df_AADT.shape)
df_AADT.head()
(87999, 44)
Out[ ]:
RowID Year Toid GRID_ExactCut_ID Location_ExactCut BoroughName_ExactCut TLRN MotorwayNumber AADT Motorcycle AADT Taxi ... VKM_Coach VKM_Rigid2Axle VKM_Rigid3Axle VKM_Rigid4Axle VKM_Artic3Axle VKM_Artic5Axle VKM_Artic6Axle VKM_ElectricCar VKM_ElectricLgv VKM_TOTAL
0 1.0 2013.0 4.000000e+15 836.0 Outer Hillingdon Other Other 88.301916 77.112580 ... 149.248696 293.680300 55.978941 39.030966 16.191367 10.970609 3.993946 4.335614 1.235289 16605.011414
1 2.0 2013.0 4.000000e+15 2217.0 Outer Hillingdon Other Other 88.301916 77.112580 ... 98.338925 193.503902 36.884134 25.717231 10.668379 7.228458 2.631583 2.856706 0.813924 10940.494996
2 3.0 2013.0 4.000000e+15 282.0 External NonGLA Other Other 310.363572 100.322495 ... 1657.075319 12950.212101 3011.364039 2861.551314 1710.809301 1966.897025 1647.110606 221.806380 47.635028 796735.125068
3 4.0 2013.0 4.000000e+15 873.0 Outer Hillingdon Other Other 39.473081 144.548284 ... 118.008843 9777.985094 2051.227418 1024.275647 470.758531 815.631678 1959.389833 78.775616 15.287825 284144.265992
4 5.0 2013.0 4.000000e+15 2930.0 Outer Hillingdon Other Other 39.473081 144.548284 ... 401.216526 33244.027352 6973.937855 3482.419671 1600.524988 2773.054115 6661.700602 267.828056 51.976850 966057.900401

5 rows × 44 columns

Join the dataframes¶

In [ ]:
# left join df_RoadEmissions and df_AADT on the ['Toid' and 'GRID_ExactCut_ID'] column
df = pd.merge(df_RoadEmissions, df_AADT, how='left',
            left_on=['Toid', 'GRID_ExactCut_ID'],
            right_on=['Toid', 'GRID_ExactCut_ID'])
print(df.shape)
df.head()
(879960, 75)
Out[ ]:
GridId Toid GRID_ExactCut_ID Location_ExactCut_x BoroughName_ExactCut_x Lts Length (m)_x Emissions Year_x Pollutant ... VKM_Coach VKM_Rigid2Axle VKM_Rigid3Axle VKM_Rigid4Axle VKM_Artic3Axle VKM_Artic5Axle VKM_Artic6Axle VKM_ElectricCar VKM_ElectricLgv VKM_TOTAL
0 6253 4000000027908919 24 External NonGLA 18898.0 50.761449 DFT 2013 CO2 ... 0.0 2383.880434 229.558857 335.509098 194.242109 247.217230 176.583736 28.990862 5.460174 104036.993985
1 6253 4000000027947931 24 External NonGLA 18895.0 28.592125 DFT 2013 CO2 ... 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.046589 0.000000 140.050860
2 6253 4000000028013383 24 External NonGLA 15816.0 5.101391 DFT 2013 CO2 ... 0.0 239.573680 23.070058 33.717777 19.520818 24.844678 17.746199 2.913505 0.548733 10455.442789
3 6253 4000000028025820 24 External NonGLA 15816.0 3.757501 DFT 2013 CO2 ... 0.0 176.461358 16.992575 24.835302 14.378333 18.299696 13.071212 2.145983 0.404177 7701.103189
4 6253 4000000028029388 24 External NonGLA 15816.0 1.624593 DFT 2013 CO2 ... 0.0 76.294807 7.346907 10.737788 6.216614 7.912054 5.651467 0.927837 0.174750 3329.647849

5 rows × 75 columns

In [ ]:
# remove unnecessary columns
columnstokeep = df.columns
for a in columnstokeep:
    if 'VKM' in a:
        columnstokeep = columnstokeep.drop(a)
df = df[columnstokeep]
print(df.shape)
df.head()
(879960, 58)
Out[ ]:
GridId Toid GRID_ExactCut_ID Location_ExactCut_x BoroughName_ExactCut_x Lts Length (m)_x Emissions Year_x Pollutant ... AADT Rigid3Axle AADT Rigid4Axle AADT Artic3Axle AADT Artic5Axle AADT Artic6Axle AADT ElectricCar AADT ElectricLgv AADT TOTAL Speed (kph) Length (m)_y
0 6253 4000000027908919 24 External NonGLA 18898.0 50.761449 DFT 2013 CO2 ... 12.389882 18.108290 10.483747 13.342950 9.530679 1.564711 0.29470 5615.144325 42.269546 50.761449
1 6253 4000000027947931 24 External NonGLA 18895.0 28.592125 DFT 2013 CO2 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.004464 0.00000 13.419814 32.236053 28.592125
2 6253 4000000028013383 24 External NonGLA 15816.0 5.101391 DFT 2013 CO2 ... 6.194941 9.054145 5.241873 6.671475 4.765339 0.782356 0.14735 2807.572163 35.051885 10.202783
3 6253 4000000028025820 24 External NonGLA 15816.0 3.757501 DFT 2013 CO2 ... 6.194941 9.054145 5.241873 6.671475 4.765339 0.782356 0.14735 2807.572163 35.051885 7.515003
4 6253 4000000028029388 24 External NonGLA 15816.0 1.624593 DFT 2013 CO2 ... 6.194941 9.054145 5.241873 6.671475 4.765339 0.782356 0.14735 2807.572163 35.051885 3.249186

5 rows × 58 columns

In [ ]:
df.columns
Out[ ]:
Index(['GridId', 'Toid', 'GRID_ExactCut_ID', 'Location_ExactCut_x',
       'BoroughName_ExactCut_x', 'Lts', 'Length (m)_x', 'Emissions', 'Year_x',
       'Pollutant', 'Emissions Unit', 'Motorcycle', 'Taxi', 'Car',
       'BusAndCoach', 'Lgv', 'Rigid', 'Artic', 'Rigid2Axle', 'Rigid3Axle',
       'Rigid4Axle', 'Artic3Axle', 'Artic5Axle', 'Artic6Axle', 'PetrolCar',
       'DieselCar', 'PetrolLgv', 'DieselLgv', 'LtBus', 'Coach', 'ElectricCar',
       'ElectricLgv', 'DotRef', 'RowID', 'Year_y', 'Location_ExactCut_y',
       'BoroughName_ExactCut_y', 'TLRN', 'MotorwayNumber', 'AADT Motorcycle',
       'AADT Taxi', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv',
       'AADT LtBus', 'AADT Coach', 'AADT Rigid2Axle', 'AADT Rigid3Axle',
       'AADT Rigid4Axle', 'AADT Artic3Axle', 'AADT Artic5Axle',
       'AADT Artic6Axle', 'AADT ElectricCar', 'AADT ElectricLgv', 'AADT TOTAL',
       'Speed (kph)', 'Length (m)_y'],
      dtype='object')

Classify columns to labels and targets¶

In [ ]:
label = []
columnstokeep = df.columns
for a in columnstokeep:
    if 'AADT' in a:
        label = label + [a]
    elif 'Speed' in a:
        label = label + [a]
    elif 'Length' in a:
        label = label + [a]
label
Out[ ]:
['Length (m)_x',
 'AADT Motorcycle',
 'AADT Taxi',
 'AADT Pcar',
 'AADT Dcar',
 'AADT PLgv',
 'AADT DLgv',
 'AADT LtBus',
 'AADT Coach',
 'AADT Rigid2Axle',
 'AADT Rigid3Axle',
 'AADT Rigid4Axle',
 'AADT Artic3Axle',
 'AADT Artic5Axle',
 'AADT Artic6Axle',
 'AADT ElectricCar',
 'AADT ElectricLgv',
 'AADT TOTAL',
 'Speed (kph)',
 'Length (m)_y']
In [ ]:
# remove irrelevant columns
label.remove('AADT TOTAL')
label.remove('Speed (kph)')
label.remove('Length (m)_y')
label
Out[ ]:
['Length (m)_x',
 'AADT Motorcycle',
 'AADT Taxi',
 'AADT Pcar',
 'AADT Dcar',
 'AADT PLgv',
 'AADT DLgv',
 'AADT LtBus',
 'AADT Coach',
 'AADT Rigid2Axle',
 'AADT Rigid3Axle',
 'AADT Rigid4Axle',
 'AADT Artic3Axle',
 'AADT Artic5Axle',
 'AADT Artic6Axle',
 'AADT ElectricCar',
 'AADT ElectricLgv']
In [ ]:
target = ['Motorcycle', 'Taxi', 'Car',
          'BusAndCoach', 'Lgv', 'Rigid', 'Artic', 'Rigid2Axle', 'Rigid3Axle',
          'Rigid4Axle', 'Artic3Axle', 'Artic5Axle', 'Artic6Axle', 'PetrolCar',
          'DieselCar', 'PetrolLgv', 'DieselLgv', 'LtBus', 'Coach', 'ElectricCar',
          'ElectricLgv']

Visualize the data¶

In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# plot the correlation matrix
corr = df[label + target].corr()
plt.figure(figsize=(30,30))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
plt.show()

Derive the target¶

In [ ]:
# sum the target variables to one column
df['Total Emissions'] = df[target].sum(axis=1)

Split data for each type of pollutants¶

In [ ]:
# Split df by unique 'Pollutant' values into dictionary
df_dict = {k: v for k, v in df.groupby('Pollutant')}
df_dict.keys()
Out[ ]:
dict_keys(['CO2', 'NOx', 'PM10_Brake', 'PM10_Exhaust', 'PM10_Resusp', 'PM10_Tyre', 'PM25_Brake', 'PM25_Exhaust', 'PM25_Resusp', 'PM25_Tyre'])
In [ ]:
for k, v in df_dict.items():
    print(k, v.shape)
CO2 (87996, 59)
NOx (87996, 59)
PM10_Brake (87996, 59)
PM10_Exhaust (87996, 59)
PM10_Resusp (87996, 59)
PM10_Tyre (87996, 59)
PM25_Brake (87996, 59)
PM25_Exhaust (87996, 59)
PM25_Resusp (87996, 59)
PM25_Tyre (87996, 59)

Split the data into train and test¶

In [ ]:
from sklearn.model_selection import train_test_split

train_test_set = {}

for k, v in df_dict.items():
    x_train, x_test, y_train, y_test = train_test_split(v[label], v['Total Emissions'], test_size=0.2, random_state=42)
    # drop indexes
    x_train = x_train.reset_index(drop=True)
    x_test = x_test.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)
    train_test_set[k] = {'x_train': x_train, 'x_test': x_test, 'y_train': y_train, 'y_test': y_test}

train_test_set.keys()
Out[ ]:
dict_keys(['CO2', 'NOx', 'PM10_Brake', 'PM10_Exhaust', 'PM10_Resusp', 'PM10_Tyre', 'PM25_Brake', 'PM25_Exhaust', 'PM25_Resusp', 'PM25_Tyre'])

Feature Selection¶

In [ ]:
from sklearn.feature_selection import SelectKBest, f_regression
In [ ]:
# feature selection
for k, v in train_test_set.items():
    selector = SelectKBest(f_regression, k=10)
    selector.fit(v['x_train'], v['y_train'])
    v['x_train'] = selector.transform(v['x_train'])
    v['x_test'] = selector.transform(v['x_test']) 
    v['selected_features'] = [label[i] for i in selector.get_support(indices=True)]
    v['x_train'] = pd.DataFrame(v['x_train'], columns=v['selected_features'])
    v['x_test'] = pd.DataFrame(v['x_test'], columns=v['selected_features'])
    v['selector'] = selector
    print(k, v['selected_features'])
    
CO2 ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle']
NOx ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle']
PM10_Brake ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle']
PM10_Exhaust ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle']
PM10_Resusp ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle']
PM10_Tyre ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle']
PM25_Brake ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle']
PM25_Exhaust ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle']
PM25_Resusp ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle']
PM25_Tyre ['Length (m)_x', 'AADT Pcar', 'AADT Dcar', 'AADT PLgv', 'AADT DLgv', 'AADT Rigid2Axle', 'AADT Rigid3Axle', 'AADT Artic3Axle', 'AADT Artic5Axle', 'AADT Artic6Axle']
In [ ]:
# rank the selected features by their scores and pvalues using selector
for k, v in train_test_set.items():
    print(k)
    for i in range(len(v['selected_features'])):
        print(v['selected_features'][i],
              v['selector'].scores_[i],
              v['selector'].pvalues_[i])
CO2
Length (m)_x 23902.929633754928 0.0
AADT Pcar 1336.9976842095416 5.494646680566392e-290
AADT Dcar 582.8689772096944 2.971227847243228e-128
AADT PLgv 22267.173089016138 0.0
AADT DLgv 41915.93607963614 0.0
AADT Rigid2Axle 42142.24283493276 0.0
AADT Rigid3Axle 29262.938191929497 0.0
AADT Artic3Axle 46.29843847699099 1.0235382383240951e-11
AADT Artic5Axle 2794.4960280143246 0.0
AADT Artic6Axle 20252.94443209883 0.0
NOx
Length (m)_x 25199.00406728696 0.0
AADT Pcar 1933.640411861189 0.0
AADT Dcar 953.1646645612719 6.706565319175453e-208
AADT PLgv 22384.543240906278 0.0
AADT DLgv 41900.15594616222 0.0
AADT Rigid2Axle 44386.849430008064 0.0
AADT Rigid3Axle 31181.185419332247 0.0
AADT Artic3Axle 25.324472931056917 4.857127545287284e-07
AADT Artic5Axle 3448.082301181072 0.0
AADT Artic6Axle 22482.45776086851 0.0
PM10_Brake
Length (m)_x 38683.2432146443 0.0
AADT Pcar 2962.7360692582956 0.0
AADT Dcar 1620.511587746007 0.0
AADT PLgv 25840.02640642982 0.0
AADT DLgv 30801.887803789727 0.0
AADT Rigid2Axle 29519.65403221131 0.0
AADT Rigid3Axle 27152.067570247164 0.0
AADT Artic3Axle 3.2804639882439957 0.07011336638544322
AADT Artic5Axle 3599.9202903149685 0.0
AADT Artic6Axle 20742.480764929813 0.0
PM10_Exhaust
Length (m)_x 21620.943593596217 0.0
AADT Pcar 1265.123428559489 1.190617668901583e-274
AADT Dcar 628.2292701958095 4.909081815610148e-138
AADT PLgv 21108.32428100092 0.0
AADT DLgv 40932.04712885946 0.0
AADT Rigid2Axle 42163.078322254085 0.0
AADT Rigid3Axle 28910.707151723433 0.0
AADT Artic3Axle 105.68849112772484 8.984237008445372e-25
AADT Artic5Axle 2763.8219202103915 0.0
AADT Artic6Axle 19334.615578500583 0.0
PM10_Resusp
Length (m)_x 17394.967006344326 0.0
AADT Pcar 860.9758350795003 4.093947432063161e-188
AADT Dcar 394.84024282225107 1.2733223918480454e-87
AADT PLgv 16106.274718291455 0.0
AADT DLgv 34182.723940202915 0.0
AADT Rigid2Axle 36921.14041658401 0.0
AADT Rigid3Axle 24070.9896754166 0.0
AADT Artic3Axle 30.76573481406677 2.9217653667944158e-08
AADT Artic5Axle 2360.5218555592533 0.0
AADT Artic6Axle 18755.936341716075 0.0
PM10_Tyre
Length (m)_x 27264.180259448123 0.0
AADT Pcar 1162.8543844906014 8.407983402639132e-253
AADT Dcar 478.53576055379347 1.0033082120536442e-105
AADT PLgv 24527.983872955032 0.0
AADT DLgv 44137.25820898399 0.0
AADT Rigid2Axle 42698.51136379059 0.0
AADT Rigid3Axle 30348.281981145126 0.0
AADT Artic3Axle 224.20872971314435 1.307524825797232e-50
AADT Artic5Axle 2761.8427120687165 0.0
AADT Artic6Axle 20362.10792590025 0.0
PM25_Brake
Length (m)_x 39563.24242949934 0.0
AADT Pcar 2914.575934461469 0.0
AADT Dcar 1613.656445389349 0.0
AADT PLgv 25650.05050439342 0.0
AADT DLgv 31220.19802930797 0.0
AADT Rigid2Axle 29622.614375626104 0.0
AADT Rigid3Axle 26880.292563898027 0.0
AADT Artic3Axle 3.5357668733612653 0.06006268045986792
AADT Artic5Axle 3655.427613131223 0.0
AADT Artic6Axle 20555.3213518843 0.0
PM25_Exhaust
Length (m)_x 21960.826750954293 0.0
AADT Pcar 1257.4778539588929 5.105697896084948e-273
AADT Dcar 597.5052107746775 2.069285930605026e-131
AADT PLgv 21293.37094202227 0.0
AADT DLgv 41605.56545776436 0.0
AADT Rigid2Axle 43043.95362958769 0.0
AADT Rigid3Axle 29398.472812733846 0.0
AADT Artic3Axle 112.02250933342329 3.696521863192289e-26
AADT Artic5Axle 2780.8012875752993 0.0
AADT Artic6Axle 19476.13679680666 0.0
PM25_Resusp
Length (m)_x 17508.076847041673 0.0
AADT Pcar 843.1614225062681 2.746024706505372e-184
AADT Dcar 362.6369905521536 1.198514413961006e-80
AADT PLgv 16046.454548760841 0.0
AADT DLgv 33989.14652901631 0.0
AADT Rigid2Axle 36907.81055888567 0.0
AADT Rigid3Axle 24060.85231687838 0.0
AADT Artic3Axle 35.86461673918812 2.1253685942018337e-09
AADT Artic5Axle 2297.4932265679067 0.0
AADT Artic6Axle 18541.778199961012 0.0
PM25_Tyre
Length (m)_x 27354.984411691832 0.0
AADT Pcar 1181.1029512249286 1.0559383455650056e-256
AADT Dcar 489.9179077726642 3.480721716445588e-108
AADT PLgv 24193.121503926428 0.0
AADT DLgv 44730.39588858886 0.0
AADT Rigid2Axle 43381.929381283895 0.0
AADT Rigid3Axle 30302.210534771802 0.0
AADT Artic3Axle 213.02996738096178 3.526501301225809e-48
AADT Artic5Axle 2734.7982145928704 0.0
AADT Artic6Axle 20291.496453043907 0.0
In [ ]:
# visualize correlation matrix of selected features
for k, v in train_test_set.items():
    corr = pd.DataFrame(v['x_train'], columns=v['selected_features']).corr()
    # set title of plt to k
    print(k)
    plt.figure(figsize=(10,10))
    sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
    plt.show()
CO2
NOx
PM10_Brake
PM10_Exhaust
PM10_Resusp
PM10_Tyre
PM25_Brake
PM25_Exhaust
PM25_Resusp
PM25_Tyre

Standardize the data¶

In [ ]:
# standardize the df using sklearn standard scaler
from sklearn.preprocessing import StandardScaler
In [ ]:
# stardaize the x_train and x_test
for k, v in train_test_set.items():
    # initialize the scaler
    scaler = StandardScaler()
    # fit the scaler to the x_train
    scaler.fit(v['x_train'])
    # transform the x_train and x_test
    v['x_train'] = scaler.transform(v['x_train'])
    v['x_test'] = scaler.transform(v['x_test'])
    v['x_train'] = pd.DataFrame(v['x_train'], columns=v['selected_features'])
    v['x_test'] = pd.DataFrame(v['x_test'], columns=v['selected_features'])
    # save the scaler
    v['scaler'] = scaler

Model Development¶

Model Training¶

Linear Regression¶

In [ ]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
In [ ]:
# model set for each pollutant
LR_set = {}

for k, v in train_test_set.items():
    model = LinearRegression()
    model.fit(v['x_train'], v['y_train'])
    y_pred = model.predict(v['x_test'])
    r2 = r2_score(v['y_test'], y_pred)
    mse = mean_squared_error(v['y_test'], y_pred)
    rmse = np.sqrt(mse)
    LR_set[k] = {'model': model, 'y_pred': y_pred, 'mse': mse, 'r2': r2, 'rmse': rmse, 'y_test': v['y_test'], 'x_test': v['x_test'], 'x_train': v['x_train'], 'y_train': v['y_train']}
    print(k, mse, r2, rmse)
CO2 156343.23707624507 0.6228061024706681 395.40262654191497
NOx 1.3337240174376739 0.5887914721059816 1.1548696971683317
PM10_Brake 0.002064996161808893 0.581905620953646 0.04544222883848121
PM10_Exhaust 0.0008067369355814339 0.5964592889324114 0.028403114892233808
PM10_Resusp 0.007830744110917233 0.5904504796613461 0.08849149174308925
PM10_Tyre 0.00028900442231542513 0.650662925495141 0.017000130067603165
PM25_Brake 0.00035156196710148805 0.566541695180498 0.01874998578936763
PM25_Exhaust 0.0005769161710396386 0.5835788811077669 0.024019079312905367
PM25_Resusp 1.0696322799024033e-05 0.5908667099465907 0.0032705233218896378
PM25_Tyre 0.00015482685366526594 0.6211541868187214 0.012442943930809379

SVM Regression¶

In [ ]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
In [ ]:
# model set for each pollutant
SVR_set = {}

for k, v in train_test_set.items():
    model = SVR(kernel='linear', C=1, gamma=1)
    model.fit(v['x_train'], v['y_train'])
    y_pred = model.predict(v['x_test'])
    mse = mean_squared_error(v['y_test'], y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(v['y_test'], y_pred)
    SVR_set[k] = {'model': model, 'y_pred': y_pred, 'mse': mse,
                  'r2': r2, 'rmse': rmse,
                  'y_test': v['y_test'], 'x_test': v['x_test'],
                  'x_train': v['x_train'], 'y_train': v['y_train']}
    print(k, mse, r2, rmse)
CO2 205750.76003393464 0.5036054481910563 453.59757498683194
NOx 1.522708912356757 0.5305243947962537 1.2339809205805239
PM10_Brake 0.003126316147087352 0.36702293573558564 0.055913470175686215
PM10_Exhaust 0.004618347885348237 -1.310159988234227 0.06795842762563181
PM10_Resusp 0.010696433148599308 0.4405743562433685 0.10342356186381954
PM10_Tyre 0.005143767056453705 -5.217581450967644 0.07172006034892682
PM25_Brake 0.0008522858713561948 -0.050825810497931556 0.029193935523601385
PM25_Exhaust 0.0044828676360326725 -2.235757377850038 0.06695422044974217
PM25_Resusp 0.004265076974011152 -162.1387727815619 0.06530755679101119
PM25_Tyre 0.005459629273487326 -12.359166338511196 0.07388930418868028

Decision Tree Regression¶

In [ ]:
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
In [ ]:
# model set for each pollutant
DTR_set = {}

for k, v in train_test_set.items():
    model = DecisionTreeRegressor(max_depth=5, random_state=42)
    model.fit(v['x_train'], v['y_train'])
    y_pred = model.predict(v['x_test'])
    mse = mean_squared_error(v['y_test'], y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(v['y_test'], y_pred)
    DTR_set[k] = {'model': model, 'y_pred': y_pred,
                  'mse': mse, 'r2': r2, 'rmse': rmse,
                    'y_test': v['y_test'], 'x_test': v['x_test'],
                    'x_train': v['x_train'], 'y_train': v['y_train']}
    print(k, mse, r2, rmse)
CO2 26796.95069826198 0.9353496418214142 163.69774188504243
NOx 0.27815509112324527 0.9142403195476974 0.5274041060925154
PM10_Brake 0.0011160096951484228 0.7740444320758149 0.03340673128500337
PM10_Exhaust 0.00012539893120846478 0.9372737609559078 0.011198166421716761
PM10_Resusp 0.0009744026984635491 0.9490385393622965 0.031215424047472896
PM10_Tyre 5.0229914878548736e-05 0.9392840726251969 0.007087306602549994
PM25_Brake 0.00022817528094263225 0.7186713019768844 0.015105471887452978
PM25_Exhaust 9.613658983853144e-05 0.9306081050997367 0.009804926814542342
PM25_Resusp 1.4630324990932855e-06 0.9440391514863675 0.001209558803487158
PM25_Tyre 2.895337285167896e-05 0.9291539947840726 0.005380833843530105

DT Visualisation¶

In [ ]:
for k, v in DTR_set.items():
    fig, ax = plt.subplots(figsize=(120, 12))
    plot_tree(v['model'],
            ax=ax,
            feature_names=v['x_train'].columns,
            fontsize=12,
            filled=True)
    plt.show()

Hyperparameter Tuning¶

In [ ]:
# Grid Search for Decision Tree Regressor
from sklearn.model_selection import GridSearchCV
In [ ]:
for k, v in train_test_set.items():
    model = DecisionTreeRegressor(random_state=42)
    param_grid = {'max_depth': [
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17]}
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(v['x_train'], v['y_train'])
    # save the best model
    DTR_set[k]['best_model'] = grid_search.best_estimator_
    # save the best score
    DTR_set[k]['best_score'] = grid_search.best_score_
    # save the best parameters
    DTR_set[k]['best_params'] = grid_search.best_params_
    # save best mse, rmse and r2
    y_pred = grid_search.best_estimator_.predict(v['x_test'])
    mse = mean_squared_error(v['y_test'], y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(v['y_test'], y_pred)
    DTR_set[k]['best_mse'] = mse
    DTR_set[k]['best_rmse'] = rmse
    DTR_set[k]['best_r2'] = r2
    # print best parameters
    print(k, grid_search.best_params_, mse, r2, rmse)
CO2 {'max_depth': 14} 10247.125837086382 0.9752777708505601 101.22808818251178
NOx {'max_depth': 12} 0.15336603560590006 0.9527147888874312 0.39161975895746126
PM10_Brake {'max_depth': 10} 0.0009293047246800024 0.8118460997672856 0.030484499744624353
PM10_Exhaust {'max_depth': 14} 6.154704803656036e-05 0.969213335309999 0.007845192670454968
PM10_Resusp {'max_depth': 16} 0.0002690010521754376 0.9859311898935056 0.01640125154295969
PM10_Tyre {'max_depth': 13} 1.7422481525247925e-05 0.9789403958670143 0.0041740246196264735
PM25_Brake {'max_depth': 9} 0.0001498214702083944 0.8152776279029379 0.01224015809572713
PM25_Exhaust {'max_depth': 13} 4.137020615323757e-05 0.9701387681609122 0.006431967518049013
PM25_Resusp {'max_depth': 16} 3.915724807628567e-07 0.9850223913059635 0.0006257575255343372
PM25_Tyre {'max_depth': 14} 8.304113879230473e-06 0.9796806645562366 0.002881685943892997

Random Forest Regression¶

In [ ]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
In [ ]:
RFR_set = {}

for k, v in train_test_set.items():
    model = RandomForestRegressor(random_state=42)
    model.fit(v['x_train'], v['y_train'])
    y_pred = model.predict(v['x_test'])
    mse = mean_squared_error(v['y_test'], y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(v['y_test'], y_pred)
    RFR_set[k] = {'model': model, 'y_pred': y_pred,
                    'mse': mse, 'r2': r2, 'rmse': rmse,
                    'y_test': v['y_test'], 'x_test': v['x_test'],
                    'x_train': v['x_train'], 'y_train': v['y_train']}
    print(k, mse, r2, rmse)
CO2 5182.412786670413 0.9874969041567384 71.98897684139158
NOx 0.0793740868329035 0.9755276946559032 0.28173407112542054
PM10_Brake 0.0006430227269420954 0.8698088680716739 0.02535789279380476
PM10_Exhaust 2.9690074433202608e-05 0.9851486237706606 0.0054488599204973705
PM10_Resusp 0.00012833779545375515 0.9932879070207226 0.011328627253721218
PM10_Tyre 1.0223574497084213e-05 0.9876421489429809 0.0031974324851487034
PM25_Brake 0.00010593440864047174 0.8693881776520503 0.010292444250054102
PM25_Exhaust 2.619579850298466e-05 0.9810917352113209 0.0051181831251904866
PM25_Resusp 1.9976005031682747e-07 0.9923591978156442 0.0004469452430855792
PM25_Tyre 4.6307604902270805e-06 0.9886689926066657 0.0021519201867697324

Hyperparameter Tuning¶

In [ ]:
param_grid = {'n_estimators': [120, 150, 200]}
for k, v in train_test_set.items():
    model = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(v['x_train'], v['y_train'])
    # save the best model
    RFR_set[k]['best_model'] = grid_search.best_estimator_
    # save the best score
    RFR_set[k]['best_score'] = grid_search.best_score_
    # save the best parameters
    RFR_set[k]['best_params'] = grid_search.best_params_
    # save best mse, rmse and r2
    y_pred = grid_search.best_estimator_.predict(v['x_test'])
    mse = mean_squared_error(v['y_test'], y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(v['y_test'], y_pred)
    RFR_set[k]['best_mse'] = mse
    RFR_set[k]['best_rmse'] = rmse
    RFR_set[k]['best_r2'] = r2
    # print best parameters
    print(k, grid_search.best_params_, mse, r2, rmse)
CO2 {'n_estimators': 200} 5181.916028565403 0.98749810263599 71.98552652141542
NOx {'n_estimators': 200} 0.07917984362339837 0.9755875829560235 0.2813891320278706
PM10_Brake {'n_estimators': 120} 0.0006479354559862355 0.86881420065431 0.02545457632698363
PM10_Exhaust {'n_estimators': 150} 3.0021769435629244e-05 0.9849827054505327 0.005479212483161175
PM10_Resusp {'n_estimators': 150} 0.00012748472250863533 0.9933325229104182 0.011290913271681585
PM10_Tyre {'n_estimators': 200} 9.870728996230191e-06 0.9880686545792372 0.0031417716333671025
PM25_Brake {'n_estimators': 200} 0.00010477083756667516 0.8708228025329586 0.010235762676355639
PM25_Exhaust {'n_estimators': 200} 2.6032444649002596e-05 0.9812096448801173 0.005102199981282838
PM25_Resusp {'n_estimators': 200} 1.982879971088954e-07 0.9924155037053795 0.0004452954043204302
PM25_Tyre {'n_estimators': 200} 4.686965799321414e-06 0.9885314638413071 0.0021649401375838117